import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsTransformer, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In this article, we compare a number of classification methods on the breast cancer dataset. The details regarding this dataset can be found in the Diagnostic Wisconsin Breast Cancer Database. We use the following classification methods and then compare them in terms of performance.
Throughout this website, there are a number of articles that discuss these methods in detail. Here, we do not discuss the methods themselves and only apply them. Interested readers are encouraged to see Statistical Learning.
# Load the Wisconsin breast-cancer dataset into a DataFrame,
# adding the numeric target and a human-readable diagnosis label.
data = load_breast_cancer()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
df['Target'] = data['target']
# Capitalised class names, index-aligned with the 0/1 target codes.
labels = [name.title() for name in data['target_names'].tolist()]
df['Diagnosis'] = df['Target'].map(lambda t: labels[1] if t == 1 else labels[0])
del labels
display(df)
| mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | ... | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | Target | Diagnosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | ... | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 0 | Malignant |
| 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | ... | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 0 | Malignant |
| 2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | ... | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 0 | Malignant |
| 3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | ... | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 0 | Malignant |
| 4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | ... | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 0 | Malignant |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 564 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | 0.05623 | ... | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 | 0 | Malignant |
| 565 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | 0.05533 | ... | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 | 0 | Malignant |
| 566 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | ... | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 | 0 | Malignant |
| 567 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | 0.07016 | ... | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 | 0 | Malignant |
| 568 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | 0.05884 | ... | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 | 1 | Benign |
569 rows × 32 columns
As can be seen, the number of instances is 569 and the number of attributes is 32. The objective of the exercise is to create a classification model that can classify the type of Diagnosis based on the rest of the attributes. However, first, let's plot a count plot for the Diagnosis attribute.
# Count and percentage of each Diagnosis class, plotted as a horizontal bar chart.
Temp = df.groupby(['Diagnosis'])['Diagnosis'].agg({'count'}).reset_index(drop = False).rename(columns ={'count': 'Count'})
Temp['Percentage'] = np.round(100* Temp['Count'].values /Temp['Count'].sum(), 2)
# display(Temp.style.hide_index())
fig = px.bar(Temp, y= 'Diagnosis', x= 'Percentage', orientation='h', text = 'Count', color_discrete_sequence= ['Bisque'],
             height= 220)
fig.update_traces(marker_line_color= 'DarkRed', marker_line_width=1.5, opacity=1)
# Show the raw count inside each bar; '%{text}' prints it verbatim.
# (The previous '%{text:.2}' rounded to two significant digits, so 357
# rendered as "3.6e+2".)
fig.update_traces(texttemplate='%{text}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
# Percentages always fit in [0, 100].
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(title = 'Diagnosis Distribution', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Hold out a test set (default 25% split) with a fixed seed for reproducibility.
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Summarise the resulting array shapes in a one-row table.
pd.DataFrame(
    data={'Set': ['X_train', 'X_test', 'y_train', 'y_test'],
          'Shape': [X_train.shape, X_test.shape, y_train.shape, y_test.shape]}
).set_index('Set').T
| Set | X_train | X_test | y_train | y_test |
|---|---|---|---|---|
| Shape | (426, 30) | (143, 30) | (426,) | (143,) |
K-neighbors classification is one of the most commonly used classification techniques. Please see K-Nearest Neighbors from Statistical Learning, and this link for more details.
# --- K-Nearest Neighbors grid search ---
# Candidate neighbor counts to try
n_neighbors_list = list(np.arange(1,11,1))
# Transform X into a (weighted) graph of k nearest neighbors; precomputing the
# largest graph once lets every smaller n_neighbors reuse it.
graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list), mode='distance')
# KNeighborsClassifier consuming the precomputed distance graph
classifier_model = KNeighborsClassifier(metric='precomputed')
# Making a pipeline
full_model = Pipeline(steps=[('graph', graph_model), ('classifier', classifier_model)])
# Parameter grid
param_grid = {'classifier__n_neighbors': n_neighbors_list}
# Searching over specified parameter values for an estimator
# (n_jobs=-1 uses all cores, consistent with the other searches in this study).
grid_model = GridSearchCV(full_model, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best configuration, its CV score, and held-out test accuracy.
# NOTE(review): Styler.hide_index()/set_precision() were removed in pandas 2.0;
# newer pandas needs .hide(axis='index') / .format(precision=4).
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test, y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                                              'params', 'mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(13, 6.0))
# Left: mean CV accuracy (+/- std) per n_neighbors
_ = ax[0].errorbar(x=n_neighbors_list,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='n_neighbors', title='Classification accuracy')
# Right: mean fit time (+/- std) per n_neighbors
_ = ax[1].errorbar(x=n_neighbors_list,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='n_neighbors', title='Fit time (with caching)')
fig.tight_layout()
del graph_model, classifier_model, full_model, param_grid, ax
| Best Score | Best Paramerers | Accuracy |
|---|---|---|
| 0.929400 | {'classifier__n_neighbors': 8} | 0.958000 |
| rank_test_score | params | mean_test_score |
|---|---|---|
| 1 | {'classifier__n_neighbors': 8} | 0.9294 |
| 2 | {'classifier__n_neighbors': 10} | 0.9271 |
| 3 | {'classifier__n_neighbors': 7} | 0.9247 |
| 4 | {'classifier__n_neighbors': 6} | 0.9224 |
| 5 | {'classifier__n_neighbors': 9} | 0.9224 |
| 6 | {'classifier__n_neighbors': 3} | 0.9201 |
| 7 | {'classifier__n_neighbors': 5} | 0.9154 |
| 8 | {'classifier__n_neighbors': 2} | 0.9131 |
| 9 | {'classifier__n_neighbors': 4} | 0.9107 |
| 10 | {'classifier__n_neighbors': 1} | 0.9060 |
Logistic regression utilizes a logistic function for a classification model. Please see Logistic Regression from Statistical Learning, and this link for more details.
# --- Logistic Regression grid search ---
# Regularization strength
Regularization_Strength = [10.0**x for x in range(4)]
# Inverse of regularization strength (scikit-learn's C parameter)
C = [1/x for x in Regularization_Strength]
# Parameter grid: stopping tolerance and inverse regularization strength
param_grid = {'tol': [10.0**x for x in np.arange(-2, -5, -1)], 'C': C,}
# Logistic Regression (large max_iter so the solver converges on unscaled data)
logistic = LogisticRegression(max_iter=10000)
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(logistic, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best configuration, its CV score, and held-out test accuracy.
# NOTE(review): Styler.hide_index()/set_precision() were removed in pandas 2.0;
# newer pandas needs .hide(axis='index') / .format(precision=4).
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test, y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                                              'params', 'mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: "{'C': 1.0, 'tol': 0.01}" -> "C: 1.0, tol: 0.01"
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(13, 6.0))
# Left: mean CV accuracy (+/- std) per parameter combination
_ = ax[0].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
# Right: mean fit time (+/- std) per parameter combination
_ = ax[1].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
fig.tight_layout()
del Temp, grid_model, logistic
| Best Score | Best Paramerers | Accuracy |
|---|---|---|
| 0.948300 | {'C': 1.0, 'tol': 0.01} | 0.972000 |
| rank_test_score | params | mean_test_score |
|---|---|---|
| 1 | {'C': 1.0, 'tol': 0.01} | 0.9483 |
| 1 | {'C': 1.0, 'tol': 0.001} | 0.9483 |
| 1 | {'C': 1.0, 'tol': 0.0001} | 0.9483 |
| 4 | {'C': 0.1, 'tol': 0.01} | 0.9436 |
| 4 | {'C': 0.1, 'tol': 0.001} | 0.9436 |
| 4 | {'C': 0.1, 'tol': 0.0001} | 0.9436 |
| 7 | {'C': 0.01, 'tol': 0.01} | 0.9343 |
| 7 | {'C': 0.01, 'tol': 0.001} | 0.9343 |
| 7 | {'C': 0.01, 'tol': 0.0001} | 0.9343 |
| 10 | {'C': 0.001, 'tol': 0.01} | 0.9271 |
| 10 | {'C': 0.001, 'tol': 0.001} | 0.9271 |
| 10 | {'C': 0.001, 'tol': 0.0001} | 0.9271 |
We can also combine principal component analysis (PCA) with logistic regression: PCA for unsupervised dimensionality reduction, followed by logistic regression for the prediction.
# --- PCA + Logistic Regression grid search ---
# Grid over the number of retained components and the inverse regularization strength
param_grid = {'pca__n_components': [2, 5, 10, 15, 25, 30], 'logistic__C': np.logspace(-4, 4, 4),}
# Logistic Regression
logistic = LogisticRegression(max_iter=10000, tol=0.1)
# Principal Component Analysis
pca = PCA()
# Making a pipeline: reduce dimensionality, then classify
full_model = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(full_model, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best configuration, its CV score, and held-out test accuracy.
# NOTE(review): Styler.hide_index()/set_precision() were removed in pandas 2.0;
# newer pandas needs .hide(axis='index') / .format(precision=4).
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test, y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                                              'params', 'mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip the dict punctuation from each params repr
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(13, 6.0))
# Left: mean CV accuracy (+/- std) per parameter combination
_ = ax[0].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
# Right: mean fit time (+/- std) per parameter combination
_ = ax[1].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
fig.tight_layout()
# Delete the fitted *instance* 'pca' — the original 'del PCA' removed the
# imported PCA class itself, breaking any later use of PCA in the session.
del Temp, full_model, grid_model, pca, logistic
| Best Score | Best Paramerers | Accuracy |
|---|---|---|
| 0.964800 | {'logistic__C': 21.54434690031882, 'pca__n_components': 25} | 0.958000 |
| rank_test_score | params | mean_test_score |
|---|---|---|
| 1 | {'logistic__C': 21.54434690031882, 'pca__n_components': 30} | 0.9648 |
| 1 | {'logistic__C': 21.54434690031882, 'pca__n_components': 25} | 0.9648 |
| 3 | {'logistic__C': 10000.0, 'pca__n_components': 15} | 0.9624 |
| 4 | {'logistic__C': 10000.0, 'pca__n_components': 30} | 0.9601 |
| 5 | {'logistic__C': 10000.0, 'pca__n_components': 10} | 0.9601 |
| 5 | {'logistic__C': 21.54434690031882, 'pca__n_components': 15} | 0.9601 |
| 7 | {'logistic__C': 21.54434690031882, 'pca__n_components': 10} | 0.9554 |
| 7 | {'logistic__C': 10000.0, 'pca__n_components': 25} | 0.9554 |
| 9 | {'logistic__C': 0.046415888336127774, 'pca__n_components': 5} | 0.9437 |
| 10 | {'logistic__C': 10000.0, 'pca__n_components': 5} | 0.9413 |
| 10 | {'logistic__C': 21.54434690031882, 'pca__n_components': 5} | 0.9413 |
| 10 | {'logistic__C': 0.046415888336127774, 'pca__n_components': 30} | 0.9413 |
| 10 | {'logistic__C': 0.046415888336127774, 'pca__n_components': 15} | 0.9413 |
| 10 | {'logistic__C': 0.046415888336127774, 'pca__n_components': 10} | 0.9413 |
| 10 | {'logistic__C': 0.046415888336127774, 'pca__n_components': 25} | 0.9413 |
| 16 | {'logistic__C': 21.54434690031882, 'pca__n_components': 2} | 0.9130 |
| 16 | {'logistic__C': 0.046415888336127774, 'pca__n_components': 2} | 0.9130 |
| 16 | {'logistic__C': 10000.0, 'pca__n_components': 2} | 0.9130 |
| 19 | {'logistic__C': 0.0001, 'pca__n_components': 30} | 0.9106 |
| 19 | {'logistic__C': 0.0001, 'pca__n_components': 25} | 0.9106 |
| 19 | {'logistic__C': 0.0001, 'pca__n_components': 15} | 0.9106 |
| 19 | {'logistic__C': 0.0001, 'pca__n_components': 10} | 0.9106 |
| 19 | {'logistic__C': 0.0001, 'pca__n_components': 5} | 0.9106 |
| 24 | {'logistic__C': 0.0001, 'pca__n_components': 2} | 0.9083 |
The Decision Tree Classifier (DTC) is a non-parametric classifier that predicts the target by learning simple decision rules inferred from the data features. See sklearn.tree.DecisionTreeClassifier for more details.
# --- Decision Tree grid search ---
# Grid over the split-quality criterion and the maximum tree depth
param_grid = {'criterion':['gini','entropy'], 'max_depth': np.arange(2,14)}
# Decision Tree Classifier (the original comment said "Logistic Regression")
dtc = DecisionTreeClassifier()
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(dtc, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best configuration, its CV score, and held-out test accuracy.
# NOTE(review): Styler.hide_index()/set_precision() were removed in pandas 2.0;
# newer pandas needs .hide(axis='index') / .format(precision=4).
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test, y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                                              'params', 'mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip the dict punctuation from each params repr
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(13, 6.0))
# Left: mean CV accuracy (+/- std) per parameter combination
_ = ax[0].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
# Right: mean fit time (+/- std) per parameter combination
_ = ax[1].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
fig.tight_layout()
del Temp, grid_model
| Best Score | Best Paramerers | Accuracy |
|---|---|---|
| 0.936600 | {'criterion': 'entropy', 'max_depth': 11} | 0.958000 |
| rank_test_score | params | mean_test_score |
|---|---|---|
| 1 | {'criterion': 'entropy', 'max_depth': 11} | 0.9366 |
| 2 | {'criterion': 'entropy', 'max_depth': 4} | 0.9342 |
| 3 | {'criterion': 'entropy', 'max_depth': 13} | 0.9318 |
| 3 | {'criterion': 'gini', 'max_depth': 3} | 0.9318 |
| 5 | {'criterion': 'entropy', 'max_depth': 3} | 0.9295 |
| 6 | {'criterion': 'entropy', 'max_depth': 9} | 0.9295 |
| 7 | {'criterion': 'gini', 'max_depth': 5} | 0.9248 |
| 8 | {'criterion': 'entropy', 'max_depth': 7} | 0.9248 |
| 9 | {'criterion': 'entropy', 'max_depth': 2} | 0.9248 |
| 10 | {'criterion': 'gini', 'max_depth': 6} | 0.9225 |
| 10 | {'criterion': 'gini', 'max_depth': 11} | 0.9225 |
| 12 | {'criterion': 'gini', 'max_depth': 4} | 0.9225 |
| 12 | {'criterion': 'entropy', 'max_depth': 12} | 0.9225 |
| 14 | {'criterion': 'gini', 'max_depth': 2} | 0.9225 |
| 15 | {'criterion': 'entropy', 'max_depth': 10} | 0.9202 |
| 15 | {'criterion': 'gini', 'max_depth': 7} | 0.9202 |
| 15 | {'criterion': 'gini', 'max_depth': 13} | 0.9202 |
| 18 | {'criterion': 'entropy', 'max_depth': 6} | 0.9202 |
| 18 | {'criterion': 'gini', 'max_depth': 8} | 0.9202 |
| 20 | {'criterion': 'gini', 'max_depth': 10} | 0.9201 |
| 21 | {'criterion': 'gini', 'max_depth': 12} | 0.9178 |
| 22 | {'criterion': 'entropy', 'max_depth': 5} | 0.9177 |
| 23 | {'criterion': 'entropy', 'max_depth': 8} | 0.9131 |
| 24 | {'criterion': 'gini', 'max_depth': 9} | 0.9131 |
# --- Support Vector Machine (RBF kernel) grid search ---
# Grid over the penalty C and the RBF kernel coefficient gamma
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
# Support Vector Machine; class_weight='balanced' compensates for the
# Malignant/Benign class imbalance.
svm = SVC(kernel='rbf', class_weight='balanced')
# Searching over specified parameter values for an estimator
# (n_jobs=-1 uses all cores, consistent with the other searches in this study).
grid_model = GridSearchCV(svm, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best configuration, its CV score, and held-out test accuracy.
# NOTE(review): Styler.hide_index()/set_precision() were removed in pandas 2.0;
# newer pandas needs .hide(axis='index') / .format(precision=4).
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test, y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                                              'params', 'mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip the dict punctuation from each params repr
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(13, 6.0))
# Left: mean CV accuracy (+/- std) per parameter combination
_ = ax[0].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
# Right: mean fit time (+/- std) per parameter combination
_ = ax[1].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
fig.tight_layout()
del Temp, grid_model, svm
| Best Score | Best Paramerers | Accuracy |
|---|---|---|
| 0.922500 | {'C': 1000.0, 'gamma': 0.0005} | 0.923100 |
| rank_test_score | params | mean_test_score |
|---|---|---|
| 1 | {'C': 1000.0, 'gamma': 0.0005} | 0.9225 |
| 1 | {'C': 50000.0, 'gamma': 0.0005} | 0.9225 |
| 1 | {'C': 10000.0, 'gamma': 0.0005} | 0.9225 |
| 1 | {'C': 100000.0, 'gamma': 0.0005} | 0.9225 |
| 1 | {'C': 5000.0, 'gamma': 0.0005} | 0.9225 |
| 6 | {'C': 1000.0, 'gamma': 0.0001} | 0.9178 |
| 6 | {'C': 10000.0, 'gamma': 0.0001} | 0.9178 |
| 6 | {'C': 100000.0, 'gamma': 0.0001} | 0.9178 |
| 6 | {'C': 5000.0, 'gamma': 0.0001} | 0.9178 |
| 6 | {'C': 50000.0, 'gamma': 0.0001} | 0.9178 |
| 11 | {'C': 10000.0, 'gamma': 0.005} | 0.9107 |
| 11 | {'C': 5000.0, 'gamma': 0.005} | 0.9107 |
| 11 | {'C': 50000.0, 'gamma': 0.005} | 0.9107 |
| 11 | {'C': 1000.0, 'gamma': 0.005} | 0.9107 |
| 11 | {'C': 100000.0, 'gamma': 0.005} | 0.9107 |
| 16 | {'C': 50000.0, 'gamma': 0.001} | 0.9037 |
| 16 | {'C': 100000.0, 'gamma': 0.001} | 0.9037 |
| 16 | {'C': 10000.0, 'gamma': 0.001} | 0.9037 |
| 16 | {'C': 5000.0, 'gamma': 0.001} | 0.9037 |
| 16 | {'C': 1000.0, 'gamma': 0.001} | 0.9037 |
| 21 | {'C': 10000.0, 'gamma': 0.01} | 0.6338 |
| 21 | {'C': 100000.0, 'gamma': 0.01} | 0.6338 |
| 21 | {'C': 5000.0, 'gamma': 0.01} | 0.6338 |
| 21 | {'C': 50000.0, 'gamma': 0.01} | 0.6338 |
| 21 | {'C': 1000.0, 'gamma': 0.01} | 0.6338 |
| 26 | {'C': 5000.0, 'gamma': 0.1} | 0.6291 |
| 26 | {'C': 50000.0, 'gamma': 0.1} | 0.6291 |
| 26 | {'C': 1000.0, 'gamma': 0.1} | 0.6291 |
| 26 | {'C': 10000.0, 'gamma': 0.1} | 0.6291 |
| 26 | {'C': 100000.0, 'gamma': 0.1} | 0.6291 |
A random forest classifier (RFC) fits several decision tree classifiers on (using sub-samples of the dataset) and then averages them to improve the predictive accuracy. See sklearn.ensemble.RandomForestClassifier for more details.
# --- Random Forest grid search ---
# Grid over ensemble size, tree depth, and minimum leaf size.
# The original ended the dict with a stray trailing comma, turning param_grid
# into a 1-tuple containing the dict — it worked only because GridSearchCV
# also accepts a sequence of dicts.
param_grid = {'n_estimators': [n*100 for n in [2**m for m in np.arange(0,2)]],   # [100, 200]
              'max_depth': list(np.arange(2,4)),                                 # [2, 3]
              'min_samples_leaf': [10.0**x for x in np.arange(-1,-4,-1)]}        # fractions of the training set
# Random Forest Classifier
rfc = RandomForestClassifier()
# Searching over specified parameter values for an estimator
# (n_jobs=-1 uses all cores, consistent with the other searches in this study).
grid_model = GridSearchCV(rfc, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best configuration, its CV score, and held-out test accuracy.
# NOTE(review): Styler.hide_index()/set_precision() were removed in pandas 2.0;
# newer pandas needs .hide(axis='index') / .format(precision=4).
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test, y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                                              'params', 'mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip the dict punctuation from each params repr
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(13, 6.0))
# Left: mean CV accuracy (+/- std) per parameter combination
_ = ax[0].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
# Right: mean fit time (+/- std) per parameter combination
_ = ax[1].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
fig.tight_layout()
del Temp, grid_model, rfc
| Best Score | Best Paramerers | Accuracy |
|---|---|---|
| 0.950700 | {'max_depth': 3, 'min_samples_leaf': 0.001, 'n_estimators': 100} | 0.972000 |
| rank_test_score | params | mean_test_score |
|---|---|---|
| 1 | {'max_depth': 3, 'min_samples_leaf': 0.001, 'n_estimators': 100} | 0.9507 |
| 2 | {'max_depth': 3, 'min_samples_leaf': 0.01, 'n_estimators': 200} | 0.9460 |
| 3 | {'max_depth': 3, 'min_samples_leaf': 0.001, 'n_estimators': 200} | 0.9437 |
| 4 | {'max_depth': 2, 'min_samples_leaf': 0.01, 'n_estimators': 100} | 0.9437 |
| 4 | {'max_depth': 2, 'min_samples_leaf': 0.001, 'n_estimators': 200} | 0.9437 |
| 4 | {'max_depth': 3, 'min_samples_leaf': 0.01, 'n_estimators': 100} | 0.9437 |
| 7 | {'max_depth': 2, 'min_samples_leaf': 0.001, 'n_estimators': 100} | 0.9390 |
| 8 | {'max_depth': 2, 'min_samples_leaf': 0.01, 'n_estimators': 200} | 0.9390 |
| 9 | {'max_depth': 3, 'min_samples_leaf': 0.1, 'n_estimators': 200} | 0.9342 |
| 10 | {'max_depth': 3, 'min_samples_leaf': 0.1, 'n_estimators': 100} | 0.9272 |
| 11 | {'max_depth': 2, 'min_samples_leaf': 0.1, 'n_estimators': 200} | 0.9272 |
| 12 | {'max_depth': 2, 'min_samples_leaf': 0.1, 'n_estimators': 100} | 0.9225 |
The Gradient Boosting Classifier (GBC) optimizes a model in several stages using a differentiable loss function. See sklearn.ensemble.GradientBoostingClassifier for more details.
# --- Gradient Boosting grid search ---
# Grid over loss, shrinkage, ensemble size, and row subsampling.
# NOTE(review): loss='deviance' was renamed 'log_loss' in scikit-learn 1.1 and
# removed in 1.3 — update this grid when upgrading sklearn.
param_grid = {'loss': ['deviance', 'exponential'],
              'learning_rate': [0.1, 0.2, 0.3],
              'n_estimators': [100, 200],
              'subsample': [0.5, 1.0]}
# Gradient Boosting Classifier
gbc = GradientBoostingClassifier()
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(gbc, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best configuration, its CV score, and held-out test accuracy.
# NOTE(review): Styler.hide_index()/set_precision() were removed in pandas 2.0;
# newer pandas needs .hide(axis='index') / .format(precision=4).
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test, y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                                              'params', 'mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip the dict punctuation from each params repr
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(13, 6.0))
# Left: mean CV accuracy (+/- std) per parameter combination
_ = ax[0].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
# Right: mean fit time (+/- std) per parameter combination
_ = ax[1].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
fig.tight_layout()
del Temp, grid_model, gbc
| Best Score | Best Paramerers | Accuracy |
|---|---|---|
| 0.969400 | {'learning_rate': 0.3, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 0.5} | 0.965000 |
| rank_test_score | params | mean_test_score |
|---|---|---|
| 1 | {'learning_rate': 0.3, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 0.5} | 0.9694 |
| 2 | {'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 0.5} | 0.9648 |
| 3 | {'learning_rate': 0.3, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 0.5} | 0.9647 |
| 3 | {'learning_rate': 0.1, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 0.5} | 0.9647 |
| 3 | {'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 0.5} | 0.9647 |
| 3 | {'learning_rate': 0.2, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 0.5} | 0.9647 |
| 7 | {'learning_rate': 0.2, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 0.5} | 0.9647 |
| 8 | {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 0.5} | 0.9624 |
| 9 | {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 0.5} | 0.9624 |
| 10 | {'learning_rate': 0.1, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 0.5} | 0.9601 |
| 11 | {'learning_rate': 0.3, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 0.5} | 0.9600 |
| 11 | {'learning_rate': 0.3, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 0.5} | 0.9600 |
| 13 | {'learning_rate': 0.3, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 1.0} | 0.9577 |
| 14 | {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 1.0} | 0.9554 |
| 15 | {'learning_rate': 0.3, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 1.0} | 0.9553 |
| 15 | {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 1.0} | 0.9553 |
| 15 | {'learning_rate': 0.2, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 1.0} | 0.9553 |
| 18 | {'learning_rate': 0.1, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 1.0} | 0.9530 |
| 18 | {'learning_rate': 0.3, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 1.0} | 0.9530 |
| 20 | {'learning_rate': 0.3, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 1.0} | 0.9530 |
| 20 | {'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 200, 'subsample': 1.0} | 0.9530 |
| 22 | {'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 100, 'subsample': 1.0} | 0.9507 |
| 23 | {'learning_rate': 0.2, 'loss': 'deviance', 'n_estimators': 200, 'subsample': 1.0} | 0.9507 |
| 24 | {'learning_rate': 0.1, 'loss': 'deviance', 'n_estimators': 100, 'subsample': 1.0} | 0.9483 |
This model optimizes the log-loss function using LBFGS or stochastic gradient descent. See sklearn.neural_network.MLPClassifier.
# --- Multi-layer Perceptron grid search ---
# Grid over the solver, L2 penalty, and learning-rate schedule
param_grid = {'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [10.0**x for x in np.arange(-1,-4,-1)],
              'learning_rate' : ['constant', 'invscaling', 'adaptive']}
# Multi-layer Perceptron classifier (large max_iter so training can converge
# on the unscaled features)
mlp = MLPClassifier(max_iter = 1000)
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(mlp, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best configuration, its CV score, and held-out test accuracy.
# NOTE(review): Styler.hide_index()/set_precision() were removed in pandas 2.0;
# newer pandas needs .hide(axis='index') / .format(precision=4).
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test, y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
                                              'params', 'mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
        .background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip the dict punctuation from each params repr
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(13, 6.0))
# Left: mean CV accuracy (+/- std) per parameter combination
_ = ax[0].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
# Right: mean fit time (+/- std) per parameter combination
_ = ax[1].errorbar(x=Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels=Temp, rotation=90, fontsize=10)
fig.tight_layout()
del Temp, grid_model
| Best Score | Best Paramerers | Accuracy |
|---|---|---|
| 0.953100 | {'alpha': 0.001, 'learning_rate': 'constant', 'solver': 'lbfgs'} | 0.958000 |
| rank_test_score | params | mean_test_score |
|---|---|---|
| 1 | {'alpha': 0.001, 'learning_rate': 'constant', 'solver': 'lbfgs'} | 0.9531 |
| 2 | {'alpha': 0.1, 'learning_rate': 'invscaling', 'solver': 'lbfgs'} | 0.9483 |
| 3 | {'alpha': 0.01, 'learning_rate': 'adaptive', 'solver': 'adam'} | 0.9436 |
| 4 | {'alpha': 0.1, 'learning_rate': 'invscaling', 'solver': 'adam'} | 0.9342 |
| 5 | {'alpha': 0.1, 'learning_rate': 'constant', 'solver': 'lbfgs'} | 0.9295 |
| 6 | {'alpha': 0.001, 'learning_rate': 'invscaling', 'solver': 'adam'} | 0.9295 |
| 7 | {'alpha': 0.01, 'learning_rate': 'invscaling', 'solver': 'adam'} | 0.9271 |
| 8 | {'alpha': 0.001, 'learning_rate': 'adaptive', 'solver': 'adam'} | 0.9249 |
| 9 | {'alpha': 0.1, 'learning_rate': 'adaptive', 'solver': 'adam'} | 0.9224 |
| 10 | {'alpha': 0.01, 'learning_rate': 'constant', 'solver': 'adam'} | 0.9224 |
| 11 | {'alpha': 0.1, 'learning_rate': 'constant', 'solver': 'adam'} | 0.9153 |
| 12 | {'alpha': 0.1, 'learning_rate': 'adaptive', 'solver': 'lbfgs'} | 0.8762 |
| 13 | {'alpha': 0.01, 'learning_rate': 'adaptive', 'solver': 'lbfgs'} | 0.8715 |
| 14 | {'alpha': 0.001, 'learning_rate': 'invscaling', 'solver': 'lbfgs'} | 0.8708 |
| 15 | {'alpha': 0.01, 'learning_rate': 'invscaling', 'solver': 'sgd'} | 0.8472 |
| 16 | {'alpha': 0.001, 'learning_rate': 'adaptive', 'solver': 'lbfgs'} | 0.8283 |
| 17 | {'alpha': 0.01, 'learning_rate': 'constant', 'solver': 'lbfgs'} | 0.8189 |
| 18 | {'alpha': 0.01, 'learning_rate': 'invscaling', 'solver': 'lbfgs'} | 0.8150 |
| 19 | {'alpha': 0.1, 'learning_rate': 'adaptive', 'solver': 'sgd'} | 0.8071 |
| 20 | {'alpha': 0.001, 'learning_rate': 'constant', 'solver': 'adam'} | 0.7977 |
| 21 | {'alpha': 0.001, 'learning_rate': 'invscaling', 'solver': 'sgd'} | 0.7732 |
| 22 | {'alpha': 0.1, 'learning_rate': 'invscaling', 'solver': 'sgd'} | 0.7718 |
| 23 | {'alpha': 0.001, 'learning_rate': 'adaptive', 'solver': 'sgd'} | 0.7350 |
| 24 | {'alpha': 0.01, 'learning_rate': 'adaptive', 'solver': 'sgd'} | 0.7256 |
| 25 | {'alpha': 0.001, 'learning_rate': 'constant', 'solver': 'sgd'} | 0.6768 |
| 26 | {'alpha': 0.1, 'learning_rate': 'constant', 'solver': 'sgd'} | 0.6401 |
| 27 | {'alpha': 0.01, 'learning_rate': 'constant', 'solver': 'sgd'} | 0.5121 |
It seems that the Gradient Boosting Classifier performs slightly better than the rest of the classification methods in this study. All of these classification methods are tuned so that they perform at their best by implementing GridSearchCV.